## Importing libraries for the work
import pandas as pd
import sys
import numpy as np
import matplotlib.pyplot
import seaborn as sns
## Importing the dataframe into Pandas
amazon = pd.read_excel(r"C:\Users\User Pc\Desktop\Datasets\Online sales Power Querry & R\Amazon store prepared data.xlsx")
amazon.head(3)
| Browser ID | name | main_category | sub_category | image | link | ratings | no_of_ratings | discount_price | actual_price | |
|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | Lloyd 1.5 Ton 3 Star Inverter Split Ac (5 In 1... | appliances | Air Conditioners | https://m.media-amazon.com/images/I/31UISB90sY... | https://www.amazon.in/Lloyd-Inverter-Convertib... | 4.2 | 2255.0 | 32,999 | 58,990 |
| 1 | 1 | LG 1.5 Ton 5 Star AI DUAL Inverter Split AC (C... | appliances | Air Conditioners | https://m.media-amazon.com/images/I/51JFb7FctD... | https://www.amazon.in/LG-Convertible-Anti-Viru... | 4.2 | 2948.0 | 46,490 | 75,990 |
| 2 | 2 | LG 1 Ton 4 Star Ai Dual Inverter Split Ac (Cop... | appliances | Air Conditioners | https://m.media-amazon.com/images/I/51JFb7FctD... | https://www.amazon.in/LG-Inverter-Convertible-... | 4.2 | 1206.0 | 34,490 | 61,990 |
## Getting a general overview of all aspects of the data
amazon.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 551585 entries, 0 to 551584 Data columns (total 10 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Browser ID 551585 non-null int64 1 name 551585 non-null object 2 main_category 551585 non-null object 3 sub_category 551585 non-null object 4 image 551585 non-null object 5 link 551585 non-null object 6 ratings 369558 non-null float64 7 no_of_ratings 369558 non-null float64 8 discount_price 490422 non-null object 9 actual_price 533772 non-null object dtypes: float64(2), int64(1), object(7) memory usage: 42.1+ MB
## Getting an overview of the numeric aspects of the data
## The site had 551,585 browser-attempts
amazon.describe().transpose()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| Browser ID | 551585.0 | 7006.200471 | 5740.835523 | 0.0 | 1550.0 | 5933.0 | 11482.0 | 19199.0 |
| ratings | 369558.0 | 3.832311 | 0.756102 | 1.0 | 3.5 | 3.9 | 4.3 | 5.0 |
| no_of_ratings | 369558.0 | 840.778698 | 8651.536018 | 1.0 | 4.0 | 20.0 | 133.0 | 589547.0 |
## Total no of browser-attempts on the site
amazon['Browser ID'].count()
551585
## Confirming if the Browser ID list contains duplicates, and was found so
amazon['Browser ID'].is_unique
False
## Identifying all the duplicates in the list
amazon.pivot_table(index=['Browser ID'], aggfunc='size').unique()
array([112, 109, 108, 107, 106, 105, 104, 102, 101, 100, 99, 98, 95,
93, 92, 91, 89, 86, 84, 78, 77, 74, 73, 70, 61, 60,
57, 55, 54, 50, 49, 47, 45, 44, 43, 41, 39, 37, 36,
33, 31, 30, 29, 28, 25, 18, 16, 11, 10, 6], dtype=int64)
## Total number of unique browsers on the site
len(amazon.pivot_table(index=['Browser ID'], aggfunc='size'))
19200
## Main categories by their traffic levels on the site
main_category_traffic_levels = amazon.groupby('main_category')['Browser ID'].count().sort_values(ascending=False)
main_category_traffic_levels[:10]
main_category accessories 116141 men's clothing 76656 women's clothing 76512 tv, audio & cameras 68659 men's shoes 57456 appliances 33096 stores 32903 home & kitchen 14568 kids' fashion 13488 sports & fitness 12648 Name: Browser ID, dtype: int64
## Visual rep. of Main category by their respective traffic levels
main_category_traffic_levels[:10].plot(kind='bar', color='brown')
<Axes: xlabel='main_category'>
## Main categories by their unique customer attempts on the site
main_category_by_unique_browserID = amazon.groupby('main_category')['Browser ID'].nunique().sort_values(ascending=False)
main_category_by_unique_browserID
main_category men's clothing 19200 men's shoes 19200 women's clothing 19200 stores 19200 accessories 19152 appliances 9600 tv, audio & cameras 9600 kids' fashion 2736 beauty & health 2640 bags & luggage 2208 women's shoes 2208 industrial supplies 1440 home & kitchen 1416 car & motorbike 1392 toys & baby products 1392 grocery & gourmet foods 1296 sports & fitness 1296 music 1080 pet supplies 984 home, kitchen, pets 24 Name: Browser ID, dtype: int64
## Visual rep. of Main category by their unique customer attempts
main_category_by_unique_browserID[:10].plot(kind='bar', grid=True, color='green')
<Axes: xlabel='main_category'>
## Main category products with the highest ratings on the site
main_category_ratings= amazon.groupby('main_category',
as_index=False)['ratings'].sum().sort_values(by='ratings',
ascending=False)[:10]
main_category_ratings
| main_category | ratings | |
|---|---|---|
| 0 | accessories | 263218.9 |
| 18 | women's clothing | 241475.7 |
| 17 | tv, audio & cameras | 189407.5 |
| 10 | men's clothing | 158233.0 |
| 15 | stores | 111927.4 |
| 11 | men's shoes | 105521.0 |
| 1 | appliances | 105073.7 |
| 6 | home & kitchen | 52804.2 |
| 14 | sports & fitness | 33591.4 |
| 3 | beauty & health | 30908.1 |
## Viz of main_category by their total customer ratings of the products on the site
main_category_ratings.plot(kind='bar', x='main_category', y='ratings', color='purple')
<Axes: xlabel='main_category'>
## Preparing the data for the process
amazon['actual_price'].fillna(0, inplace=True)
## Contd data prepration for the correlation analysis
amazon['discount_price'].fillna(0, inplace=True)
## Contd data prepration for the correlation analysis
amazon['discount_price'] = amazon['discount_price'].replace('[^\d.]', '', regex=True).astype(float)
## Contd data prepration for the correlation analysis
amazon['actual_price'] = amazon['actual_price'].replace('[^\d.]', '', regex=True).astype(float)
## The correlation data to analyze variable relationships
corr_df = amazon.drop(['Browser ID', 'name', 'main_category', 'sub_category', 'image', 'link'], axis=1).corr(method='pearson')
## Correlation matrix
corr_df.style.background_gradient(cmap=\
'coolwarm')
| ratings | no_of_ratings | discount_price | actual_price | |
|---|---|---|---|---|
| ratings | 1.000000 | 0.040689 | 0.055439 | 0.076250 |
| no_of_ratings | 0.040689 | 1.000000 | 0.014449 | 0.017019 |
| discount_price | 0.055439 | 0.014449 | 1.000000 | 0.848990 |
| actual_price | 0.076250 | 0.017019 | 0.848990 | 1.000000 |
## As seen above, the result shows that there is
## An Inverse relationship between ratings, discount and actual price
## meaning; the lower the discount or actual price, the higher the rating and number of ratings a product gets
## There is also a strong positive relationship between discount and actual price as exoected
## meaning; the higher the actual price, the higher the discount amount on specific products